---
title: "Replicação do artigo {censobr}: Easy Access to Brazilian Population Census Data, de Rafael Pereira e Rogério Barbosa"
author: "Revista DADOS"
date: "2025-11-03"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
                      warning = FALSE,
                      message = FALSE)

library(censobr)
library(tidyverse)
library(geobr)


#options(arrow.unsafe_metadata = TRUE) 



```

Additional info: https://github.com/ipeaGIT/censobr

# Data dictionary

```{r}
# dictionary of variables: population microdata
data_dictionary(year = 2010, dataset = 'population')

# dictionary of variables: household microdata
data_dictionary(year = 2010, dataset = 'households')

# dictionary of census tract variables (aggregate data)
data_dictionary(year = 2010, dataset = 'tracts')
```
# Questionnaires

```{r}
# short questionnaire for the universe survey
questionnaire(year = 2022, type = 'short')

# long questionnaire for the sample survey
questionnaire(year = 2022, type = 'long')

  
  
```

# Interviewer’s Manual

```{r}
# 2022 Census
interview_manual(year = 2022)

# 1960 Census
interview_manual(year = 1960)

```

# Handling larger than memory data

```{r}
# Read 2010 mortality data
mortality_2010 <- censobr::read_mortality(year = 2010, add_labels = 'pt')

# Filter deaths of men in the state of Rio de Janeiro
rio <- mortality_2010 |>
  filter(V0704 == 'Masculino' & abbrev_state == 'RJ')

# Collect the data, loading it into the memory as a data.frame
rio_df <- rio |> dplyr::collect()
```





```{r}
library(duckdb)
library(DBI)

# Read 2010 mortality data
mortality_2010 <- censobr::read_mortality(year = 2010, add_labels = 'pt')

# Create a database connection
con <- duckdb::dbConnect(duckdb::duckdb())

# Register the Arrow table in the database
duckdb::duckdb_register_arrow(con, 'mortality_2010_tbl', mortality_2010)

# Execute an SQL query to filter data
rio2 <- DBI::dbGetQuery(con, "SELECT * FROM 'mortality_2010_tbl' WHERE V0704 LIKE '%Masculino%' AND abbrev_state = 'RJ'")
```


# Population Data: Making Age Pyramids


```{r}
# Load population data for 1970 and 2010
pop_1970 <- censobr::read_population(year = 1970)
pop_2010 <- censobr::read_population(year = 2010)
```


```{r}
# Summarize age distribution
age_dist_1970 <- pop_1970 |>
        mutate(age    = ifelse(V026 %in% c(3, 4), 
                               as.numeric(V027), 0),
               gender = ifelse(V023 == 0, "Men", "Women"),
               year   = 1970) |>
        group_by(age, gender, year) |>
        summarize(count = sum(V054)) |>
        dplyr::collect()

age_dist_2010 <- pop_2010 |>
        mutate(age    = as.numeric(V6036),
               gender = ifelse(V0601  == 1, "Men", "Women"),
               year   = 2010) |>
        group_by(age, gender, year) |>
        summarize(count = sum(V0010)) |>
        dplyr::collect()
```

```{r}
# Gathering, recoding, and aggregating by age groups
pyramid_df = bind_rows(age_dist_1970,
                       age_dist_2010) |>
        filter(!is.na(age)) |>
        mutate(count = ifelse(gender == "Men", count, -count),
               age_group = dplyr::case_when(
                       age <= 04            ~ "00-04",
                       age >= 05 & age <= 09 ~ "05-09",
                       age >= 10 & age <= 14 ~ "10-14",
                       age >= 15 & age <= 19 ~ "15-19",
                       age >= 20 & age <= 24 ~ "20-24",
                       age >= 25 & age <= 29 ~ "25-29",
                       age >= 30 & age <= 34 ~ "30-34",
                       age >= 35 & age <= 39 ~ "35-39",
                       age >= 40 & age <= 44 ~ "40-44",
                       age >= 45 & age <= 49 ~ "45-49",
                       age >= 50 & age <= 54 ~ "50-54",
                       age >= 55 & age <= 59 ~ "55-59",
                       age >= 60 & age <= 64 ~ "60-64",
                       age >= 65 & age <= 69 ~ "65-69",
                       age >= 70 & age <= 74 ~ "70-74",
                       age >= 75 & age <= 79 ~ "75-79",
                       age >= 80             ~ "80+")) |>
        count(year, gender, age_group, wt = count)
```



```{r}
# Plotting the figure
pyramid_df |>
        ggplot(aes(x = n / 1000,
                   y = age_group,
                   fill = gender)) +
        geom_col() +
        scale_fill_discrete(name="", type=c("#437297","#ffcb69")) +       
        scale_x_continuous(labels = function(x){scales::comma(abs(x))},
                           breaks = c(-8000, -4000,0,4000, 8000),
                           name = "Population (in thousands)") +
        facet_wrap(~year) +
        theme_classic() +
        theme(legend.position = "bottom",
              axis.title.y = element_blank(),
              panel.grid.major.x = element_line(color = "grey90"), 
              strip.background.x = element_blank()) 

```


# Household Data: Sanitation Conditions

```{r}
# Load household data for 2010
households_2010 <- read_households(year = 2010)

# Summarize the number of households with and without adequate sanitation 
sanitation_summary <- households_2010 |>
        mutate(region = trunc(V0001/10),
               sanitation_access = ifelse(V0207 %in% c(1, 2),
                                          "Adequate Sanitation",
                                          "Inadequate Sanitation")) |>
        group_by(region, sanitation_access) |>
        summarise(count = sum(V0010)) |> # weighted count
        group_by(region) |>
        mutate(percentage = count/sum(count)) |>
        collect() |>
        mutate(region = factor(region,
                               levels = 1:5,
                               labels = c("North", "Northest", 
                                          "Southest", "Center-West", "South"),
                               ordered = T))
```



```{r}
# Plotting sanitation access by region
sanitation_summary |>
  ggplot(aes(x = region, y = percentage, fill = sanitation_access)) +
  geom_col() +
  labs(x = "Region",
       y = "Proportion of households"
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("#437297", "#ffcb69")) +
  theme_minimal() +
        theme(legend.position = "bottom",
              legend.title = element_blank())

```



# Using Complex Sample Design In The 1960 Census


```{r}
# Load household data for 1960
census_1960 = read_population(year = 1960)

# Filtering cases, selecting and recoding varibles
census_rj_gb = census_1960 |>
        select(uf, Water = V105, 
               censobr_upa, censobr_usa, censobr_estrato, censobr_weight, 
               censobr_source) |>
        filter(uf %in% c(52, 54),
               !is.na(Water)) |>
        collect() |>
        mutate(uf    = ifelse(uf == 52, "Rio de Janeiro", "Guanabara"),
               Water = ifelse(Water %in% c(0, 9), 1, 0),
               Water = factor(Water, 
                              levels = c(0, 1),
                              labels = c("Other Forms", 
                                         "General Public Network"), 
                              ordered = T))

# Weighted (N) and unweighted cases (n)
census_rj_gb |>
        group_by(uf) |>
        summarise(N = sum(censobr_weight),
                  n = n())
```


```{r}
# Loading the package for handling survey data
library(srvyr)


options(survey.lonely.psu = "adjust")


# Creating the object of class survey.design (not a data.frame anymore)
census_rj_gb_svy <- census_rj_gb |>
  as_survey_design(ids    = c(censobr_upa, censobr_usa),
                   strata = censobr_estrato,
                   weight = censobr_weight,
                   nest   = T)
```

```{r}
# Using dplyr-like syntax to manipulate the survey object
water_table <- census_rj_gb_svy |>
        mutate(one = 1) |>
        group_by(uf, Water) |>
        summarize(pct = survey_mean(vartype = "ci"),
                  n = unweighted(n()))

# Plotting the results
ggplot(data = water_table,
       aes(x = uf, 
           y = pct, 
           fill = Water,
           ymax = pct_upp, 
           ymin = pct_low)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(position = position_dodge(width = 0.9), width = 0.1) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("#437297", "#ffcb69")) +
  labs(x = "Areas",
       y = "Proportion") +
  theme_minimal() +
  theme(legend.position = "bottom")
```



# Working With Census Tracts  

```{r}
# download aggregated data for census tracts: household variables
dom <- read_tracts(year = 2010,
                   dataset = 'Domicilio')

names(dom)[c(1:12,301:306)]

```



```{r}
# download the data
tract_basico <- read_tracts(year = 2010,
                            dataset = "Basico")

tract_income <- read_tracts(year = 2010,
                            dataset = "DomicilioRenda")

# select columns
tract_basico <- tract_basico |> select('code_tract','V002')
tract_income <- tract_income |> select('code_tract','V003')

# merge tables
tracts_df10 <- left_join(tract_basico, tract_income)

# calculate per capita income
tracts_df10 <- tracts_df10 |>
                mutate(income_pc = V003 / V002) |>
                collect()
```

```{r}
# download Sao Paulo municipality
muni_sp <- read_municipality(code_muni = 3550308,
                             year = 2010)

# download all tracts in São Paulo 
tracts_2010 <- geobr::read_census_tract(code_tract = "SP",
                                        year = 2010,
                                        simplified = FALSE,
                                        showProgress = FALSE)

```




```{r}
sp_tracts <- left_join(tracts_2010, tracts_df10, by = 'code_tract')

ggplot() +
  geom_sf(data = sp_tracts, aes(fill = ifelse(income_pc<10000,income_pc,10000)),
          color=NA) +
  geom_sf(data = muni_sp, color='gray10', fill=NA) +
  scale_fill_viridis_c(name = "Income per capita (R$ of 2010)",
                       na.value="white",
                       option = 'cividis',
                       breaks = c(0,  1e3, 4e3, 8e3, 1e4),
                       labels  = c('0', '1,000', '4,000', '8,000', '> 10,000')) +
  theme(panel.background = element_rect(fill = "white"),
        axis.text                      = element_blank(),
        axis.ticks                    = element_blank())
```





```{r}
# download municipal boundaries 
muni_bh <- read_municipality(code_muni = 3106200 ,
                             year = 2022)

# download preliminary data from the 2022 tracts
tracts_df22 <- read_tracts(year = 2022,
                          dataset = "Preliminares") |>
               filter(name_muni == 'Belo Horizonte') |>
               collect()

# download all tracts in Minas Gerais
tracts_2022 <- geobr::read_census_tract(code_tract = "MG",
                                        year = 2022,
                                        simplified = FALSE)

# filter tracts in Belo Horizonte
tracts_2022 <- filter(tracts_2022, name_muni == 'Belo Horizonte')
```



```{r}
# merge tables
tracts_df22$code_tract <- as.numeric(tracts_df22$code_tract)
bh_tracts22 <- left_join(tracts_2022, tracts_df22, by = 'code_tract')

# calculate the area of the tracts
bh_tracts22 <- bh_tracts22 |>
              mutate(tract_aream2 = sf::st_area(tracts_2022),
                     tract_areakm2 = units::set_units(tract_aream2, km2))

# calculate population density
bh_tracts22 <- bh_tracts22 |>
               mutate(pop_km2 = as.numeric(V0001/ tract_areakm2))

# map
ggplot() +
  geom_sf(data = bh_tracts22, color=NA,
          aes(fill = ifelse(pop_km2<20000,pop_km2,20000))) +
  geom_sf(data = muni_bh, color='gray10', fill=NA) +
  scale_fill_distiller(palette = "Reds", direction = 1,
                       name='Population per'~Km^2,
                       breaks = c(0,  5e3, 10e3, 15e3, 2e4),
                       labels  = c('0', '5,000', 
                                   '10,000', '15,000', '> 20,000')) +
  theme(panel.background = element_rect(fill = "white"),
        axis.text        = element_blank(),
        axis.ticks       = element_blank())
```

